package com.zhida.gooutcrawler.spider.processor;

import com.zhida.gooutcrawler.entity.Article;
import com.zhida.gooutcrawler.entity.Page;
import com.zhida.gooutcrawler.spider.processor.IContentProcessor;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Content processor that turns the raw HTML of a crawled page into an {@link Article}.
 *
 * <p>The title is read from the {@code #artitle} element, the body HTML from the
 * {@code #zoom} element, and the article time from a {@code var tm = "...";} script
 * assignment found in the raw page source.
 *
 * Created by Administrator on 2017-04-05.
 */
public class ContentProcessor implements IContentProcessor {

    /**
     * Matches the inline assignment {@code var tm = "<timestamp>";}. Group 1 captures the
     * timestamp text (digits, whitespace, dashes and colons only). Compiled once because the
     * pattern never changes between calls.
     */
    private static final Pattern TIME_PATTERN = Pattern.compile("var tm = \"([\\d\\s\\-:]*)?\";");

    /** Date format used to parse the timestamp captured by {@link #TIME_PATTERN}. */
    private static final String TIME_FORMAT = "yyyy-MM-dd HH:mm";

    /** Column value stamped onto every article produced by this processor. */
    private final String column;

    /**
     * Creates a processor for one column.
     *
     * @param column value passed to {@code Article.setColumn} for every parsed page
     */
    public ContentProcessor(String column) {
        this.column = column;
    }

    /**
     * Parses the page's raw content into an {@link Article} and stores it on the page via
     * {@code page.setData}.
     *
     * @param page the crawled page; its raw content and URL are read, its data is written
     * @throws IllegalArgumentException if the page has no {@code var tm = "...";} assignment,
     *                                  or if the captured timestamp does not parse as
     *                                  {@code yyyy-MM-dd HH:mm}
     */
    public void parse(Page page) {
        String html = page.getRawContent();
        Document doc = Jsoup.parse(html);

        String title = doc.select("#artitle").text();
        String content = doc.select("#zoom").html();

        // The timestamp lives in a script variable, so it is searched for in the raw HTML
        // rather than selected from the parsed document.
        String time = extractTime(html);
        Date date = parseTime(time);

        Article article = new Article();
        article.setTitle(title);
        article.setContent(content);
        article.setUrl(page.getUrl());
        article.setColumn(column);
        article.setTime(date);

        System.out.println("title:" + title);
        System.out.println("time:" + time);

        page.setData(article);
    }

    /**
     * Extracts the timestamp text from the first {@code var tm = "...";} assignment in the HTML.
     *
     * @param html raw page source
     * @return the captured timestamp text; may be empty when the assignment is {@code ""}
     * @throws IllegalArgumentException if no such assignment is present
     */
    private static String extractTime(String html) {
        Matcher matcher = TIME_PATTERN.matcher(html);
        if (!matcher.find()) {
            throw new IllegalArgumentException("时间解析错误");
        }
        String time = matcher.group(1);
        if (time == null) {
            // The capture group is optional in the pattern; fail the same way as a missing
            // assignment instead of letting a null reach the date parser.
            throw new IllegalArgumentException("时间解析错误");
        }
        return time;
    }

    /**
     * Parses timestamp text using {@link #TIME_FORMAT}.
     *
     * <p>A malformed timestamp is reported with the same exception type as a missing one, so
     * an article is never published without its time being set.
     *
     * @param time timestamp text captured from the page
     * @return the parsed date
     * @throws IllegalArgumentException if the text cannot be parsed; the original
     *                                  {@link ParseException} is kept as the cause
     */
    private static Date parseTime(String time) {
        try {
            // A new formatter per call: SimpleDateFormat is not safe to share between threads.
            return new SimpleDateFormat(TIME_FORMAT).parse(time);
        } catch (ParseException e) {
            throw new IllegalArgumentException("时间解析错误: " + time, e);
        }
    }

}
